# Replication of Known Unknowns: Media Bias in the Study of Political Violence
# Nick Dietrich (dietrich.nicholas@gmail.com)
# This file reproduces the figures and tables in the article and the appendix.
# Developed in R version 3.6.1

# required packages:
#install.packages("randomForest")
#install.packages("mice")
#install.packages("stargazer")

# set random seed for replication purposes:
set.seed(12455)

# set working directory to the folder containing replication materials:
setwd("/Users/nickdietrich/Desktop/known_unknowns_replication")

# importance scores from repeated ntree=1500 random forests
imp.rep <- read.csv("variable_importance.csv")
imp.rep <- imp.rep[,3:ncol(imp.rep)]
implist <- data.frame(var=colnames(imp.rep),varimp=sapply(X=imp.rep,FUN=mean))

# character vectors of independent and dependent variables
ivs <- c("best","year","osv","nonstate","mountains_mean","media_freedom",#"urban_gc","ttime_mean",
         "bdist2","capdist","americas","africa","asia","europe","gcp_ppp_imp",
         "mideast","excluded","polity2","imr_mean",#"statebased","nlights_mean",
         "gdp_pc","ucdp_intl","ucdp_civ","ucdp_intlciv",
         "xrreg","xrcomp","xconst","parreg","parcomp","trade_percent_gdp",
         "v_media_censorship","v_internet_censorship","v_internet_access",
         "v_media_criticism","v_media_diversity","v_journo_harrassment","pop_gpw_sum_imp",
         "v_media_selfcensor","v_mbias_opposition","v_media_corrupt",#"ip4dist",
         "ip6dist","RTI","stateperp")
labs <- c("Best Estimate","Year","One-Sided Violence","Non-State Event",#"Urban Center Travel Time",
          "Mountains","FH Media Freedom","Distance to Border","Distance to Capital",
          "Americas","Africa","Asia","Europe","Grid Cell Economic Output","Middle East",#"Night Lights",
          "Num. Excluded Groups","Polity","Infant Mortality",#"State-Based Event",
          "GDP per Capita","International Conflict",
          "Civil Conflict","Internationalized Civil Conflict","Regulations Exec Recruitment",
          "Competitive Exec Recruitment","Exec Constraints",
          "Regulated Participation","Competitive Participation",
          "Trade (Percent GDP)","Media Censorship (Vdem)","Internet Censorship (Vdem)",
          "Internet Access (Vdem)","Media Criticism (Vdem)","Media Diversity (Vdem)",
          "Journalist Harrasment (Vdem)","Grid Cell Population","Self-Censorship (Vdem)",
          "Media Bias (Vdem)","Media Corruption (Vdem)",
          "Internet Distance (IP6)","Right to Information",
          "State-Perpetrated Event")
dv <- "media"

# sort variable importance scores by varimp score:
imp.rep.sort <- imp.rep[order(implist[,2],decreasing=F)]

###
### Figure 1
###
# Variable importance plot:
pdf("var_imp_main.pdf",height=8,width=8)
par(mar=c(4,13,.1,.1),mfrow=c(1,1))
plot(sort(implist[,2],decreasing=F),1:length(sort(implist[,2],decreasing=F)),main="",
     #main="Media Source (Random Forest of GED Events, 2013-2016)",
     xlab="Importance Score",
     ylab="",yaxt="n",pch=19,type="n",xlim=c(0,max(implist[,2])+.001))
# lines for readability
 for(i in 1:ncol(imp.rep)) {
  # lines for readability:
  #if(i%%2==0){abline(h=i,col="grey95")}
  abline(h=i,col="grey95")
  #points(imp.rep.sort[,i],rep(i,nrow(imp.rep)),col="grey90",pch=20)
  #print(names(impmed.sort)[i])
  lines(quantile(imp.rep.sort[,i],c(.025,.975),na.rm=T),c(i,i),col="grey50",lwd=2)
 }
points(sort(implist[,2],decreasing=F),1:length(sort(implist[,2],decreasing=F)),pch=19)
axis(side=2,at=1:length(sort(implist[,2],decreasing=F)),
     labels=labs[order(implist[,2],decreasing=F)],las=2)
legend(x="bottomright",legend="95% CI",lty=1,col="grey50",lwd=2)
dev.off()


###
### Partial Dependence Plots
###

### partial dependence from repeated ntree=1500 random forests
pd <- read.csv("partial_dependence.csv")
pd <- pd[,3:ncol(pd)]

# define a transparent color to show overlapping lines
grey.special <- rgb(180, 180, 180, max = 255, alpha = 75)

# helper function to plot partial dependence:
pd.plot <- function(df=pd,var,label,binary=F){
  # pd.plot returns a plot of the partial dependence over a range of values
  # includes confidence intervals from repeated random forests
  # df: a data.frame containing the pd values from the random forests
  # var: a character corresponding to the name of a variable in df
  # binary: T for points, F for lines for continuous variables
  sub <- na.omit(df[,c(var,"X1")]) # data frame containing desired var and probability
  tmp <- data.frame(var=unique(sub[,var]))
  tmp$mean <- NA
  #tmp$low <- NA
  #tmp$high <- NA
  # loop through and get the mean values
  for(i in 1:nrow(tmp)){
    tmp$mean[i] <- mean(sub[which(sub[,1]==tmp$var[i]),"X1"])
    #tmp$low[i] <- quantile(sub[which(sub[,1]==tmp$var[i]),"X1"],.25)
    #tmp$high[i] <- quantile(sub[which(sub[,1]==tmp$var[i]),"X1"],.75)
  }
  plot(NA,xlim=c(min(sub[,var])-.1,max(sub[,var])+.1),ylim=c(.73,.98),axes=F,ylab="Pr(Media Source)",
       xlab=label)
  for(i in 1:200){
    iter <- sub[(i*nrow(tmp)-nrow(tmp)+1):(i*nrow(tmp)),]
    lines(iter,col=grey.special)
  }
  if(binary==T){
    points(tmp,pch=19)
    axis(1,c(0,1))
  } else{
    lines(tmp,lwd=2)
    axis(1)
  }
  axis(2)
}

# helper function to plot partial dependence:
pd.diff <- function(df=pd,var){
  # pd.diff.plot returns a numeric vector of the difference in partial dependence for binary variables
  # df: a data.frame containing the pd values from the random forests
  # var: a character corresponding to the name of a variable in df
  sub <- na.omit(df[,c(var,"X1")]) # data frame containing desired var and probability
  
  tmp <- data.frame(var=unique(sub[,var]))
  diff <- numeric(200)
  #tmp$low <- NA
  #tmp$high <- NA
  # loop through and get the mean values
  for(i in 1:200){
    iter <- sub[(i*nrow(tmp)-nrow(tmp)+1):(i*nrow(tmp)),]
    diff[i] <- iter$X1[2]-iter$X1[1]
  }
  return(diff)
}

###
### Plotting
###

# Figure 2:
pdf("pd_region.pdf",width=4.5,height=3)
par(mar=c(4,6,.5,.3))
region <- c("europe","mideast","americas","asia","africa")
region.labs <- c("Europe","Middle East","Americas","Asia","Africa")
plot(rep(0,length(region)),1:length(region),axes=F,type="n",xlab="Difference in Pr(Media Report)",
     ylab="",xlim=c(-.1,.1))
axis(1)
axis(side=2,at=1:length(region),
     labels=region.labs,las=2)
abline(v=0,lty=2)
legend(x="bottomright",legend="95% CI",lty=1,col="grey50",lwd=2)
for(i in 1:length(region)){
  lab <- labs[which(ivs==region[i])]
  this.diff <- pd.diff(df=pd,var=region[i])
  lines(quantile(this.diff,c(.025,.975)),c(i,i),col="grey50",lwd=2)
  points(mean(this.diff),i,pch=19)
}
dev.off()

# Figure 4:
pdf("pd_dist.pdf",width=6.5,height=2.5)
par(mfrow=c(1,3),mar=c(4,3,.1,.3))
pd.plot(df=pd,var="trade_percent_gdp",label="Trade (Percent GDP)",binary=F)
pd.plot(df=pd,var="ip6dist",label="Internet Distance",binary=F)
pd.plot(df=pd,var="polity2",label="Polity",binary=F)
dev.off()

# Figure 5:
pdf("pd_other.pdf",width=6.5,height=2.5)
par(mfrow=c(1,4),mar=c(4,3,.1,.3))
pd.plot(df=pd,var="v_media_censorship",label="Media Censorship (Vdem)",binary=F)
pd.plot(df=pd,var="v_media_corrupt",label="Media Corruption (Vdem)",binary=F)
pd.plot(df=pd,var="media_freedom",label="FH Media Freedom",binary=F)
pd.plot(df=pd,var="osv",label="One-Sided Violence",binary=T)
dev.off()


###
### Appendix
###

# Table 1:
ged <- read.csv("ged_replication.csv")
ged$media <- as.numeric(ged$any_media)
ged <- ged[,c(dv,ivs)]

# impute missing vars
library(mice)
library(randomForest)
ged.imp <- mice(ged,method="rf",m=1,seed=666)

# extract the imputed data:
ged.imp2 <- complete(ged.imp)
# format as numeric:
ged.imp2 <- data.frame(lapply(ged.imp2, function(x) as.numeric(as.character(x))))
# set omitted category for region predictors:
logit.ivs.ordered <- ivs[order(implist[,2],decreasing=T)]
logit.ivs <- logit.ivs.ordered[which(logit.ivs.ordered!="mideast")]
logit.labs <- labs[order(implist[,2],decreasing=T)]
logit.labs <- logit.labs[which(logit.ivs.ordered!="mideast")]


# run logit
logit <- glm(as.formula(paste(dv,paste0(logit.ivs,collapse="+"),sep="~")),family=binomial(link='logit'),
             data=ged.imp2)

# create the table
library(stargazer)
out <- "logit.tex"
star <- stargazer(logit,title="Logistic Regression: Predictors of Media Source",
          covariate.labels=logit.labs,label="tab:diff",no.space=T,align=T,digits=2,
          single.row=T,column.labels="Coefficient (Standard Error)",dep.var.labels = "Pr(Media Source)",
          font.size="footnotesize")
cat(star, sep = '\n', file = out)

###
### Figure 3: proportion media events by country
###
ged <- read.csv("ged_replication.csv",stringsAsFactors=F) #Read in data w/ sources coded

# remove india and syria
ged <- ged[which((ged$country_id!=750)&(ged$country_id!=652)),]
ged <- ged[which(ged$year>=2013),]

# remove unused vars to save memory
ged <- ged[,c(dv,ivs,"country.x","country_id")]

# table of media vs. non-media sources by country
sourcetab <- table(ged$country.x,ged$any_media)

# get countries with >100 observations
sourcetab <- sourcetab[which((sourcetab[,1]+sourcetab[,2])>=100),]

# order by proportion of media stories
sourcetab <- sourcetab[order(sourcetab[,2]/(sourcetab[,1]+sourcetab[,2])),]
sourcetab <- sourcetab[order(rev(1:nrow(sourcetab))),]

# country labels:
labs <- rownames(sourcetab)
labs[7] <- "Yemen"
labs[15] <- "Russia"
labs[21] <- "DR Congo"

# plot
pdf("media_proportion.pdf",width=6.5,height=5.25)
par(mar=c(4,9,.1,.5))
plot(rep(NA,nrow(sourcetab)),1:nrow(sourcetab),xlim=c(0,1),axes=F,ylab="",
     xlab="Proportion Media-Reported Events")
axis(1)
axis(2,1:nrow(sourcetab),labs,tick=F,las=1,pos=.03)
# add 0-1 polygon for each country, then add media prop polygon over it
for(i in 1:nrow(sourcetab)){
  rect(0,i-10/nrow(sourcetab),1,i+10/nrow(sourcetab))
  rect(0,i-10/nrow(sourcetab),sourcetab[i,2]/(sourcetab[i,1]+sourcetab[i,2]),i+10/nrow(sourcetab),
       col="grey85")#density=30,lwd=.5,col="grey85"
}
dev.off()
